Setup and Data Import

Loading Required Libraries

packages <- c(
    "tidyverse", "ggplot2", "dplyr", "readr", "lme4",
    "sjPlot", "gridExtra", "viridis", "plotly",
    "knitr", "kableExtra", "readxl", "tidyr"
)

# Installing missing packages
install_if_missing <- function(packages) {
    new_packages <- packages[!(packages %in% installed.packages()[,"Package"])]
    if (length(new_packages)) {
        install.packages(new_packages)
    }
}


install_if_missing(packages)

# Loading all packages
lapply(packages, library, character.only = TRUE)
## [[1]]
##  [1] "lubridate" "forcats"   "stringr"   "dplyr"     "purrr"     "readr"    
##  [7] "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"     "graphics" 
## [13] "grDevices" "utils"     "datasets"  "methods"   "base"     
## 
## [[2]]
##  [1] "lubridate" "forcats"   "stringr"   "dplyr"     "purrr"     "readr"    
##  [7] "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"     "graphics" 
## [13] "grDevices" "utils"     "datasets"  "methods"   "base"     
## 
## [[3]]
##  [1] "lubridate" "forcats"   "stringr"   "dplyr"     "purrr"     "readr"    
##  [7] "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"     "graphics" 
## [13] "grDevices" "utils"     "datasets"  "methods"   "base"     
## 
## [[4]]
##  [1] "lubridate" "forcats"   "stringr"   "dplyr"     "purrr"     "readr"    
##  [7] "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"     "graphics" 
## [13] "grDevices" "utils"     "datasets"  "methods"   "base"     
## 
## [[5]]
##  [1] "lme4"      "Matrix"    "lubridate" "forcats"   "stringr"   "dplyr"    
##  [7] "purrr"     "readr"     "tidyr"     "tibble"    "ggplot2"   "tidyverse"
## [13] "stats"     "graphics"  "grDevices" "utils"     "datasets"  "methods"  
## [19] "base"     
## 
## [[6]]
##  [1] "sjPlot"    "lme4"      "Matrix"    "lubridate" "forcats"   "stringr"  
##  [7] "dplyr"     "purrr"     "readr"     "tidyr"     "tibble"    "ggplot2"  
## [13] "tidyverse" "stats"     "graphics"  "grDevices" "utils"     "datasets" 
## [19] "methods"   "base"     
## 
## [[7]]
##  [1] "gridExtra" "sjPlot"    "lme4"      "Matrix"    "lubridate" "forcats"  
##  [7] "stringr"   "dplyr"     "purrr"     "readr"     "tidyr"     "tibble"   
## [13] "ggplot2"   "tidyverse" "stats"     "graphics"  "grDevices" "utils"    
## [19] "datasets"  "methods"   "base"     
## 
## [[8]]
##  [1] "viridis"     "viridisLite" "gridExtra"   "sjPlot"      "lme4"       
##  [6] "Matrix"      "lubridate"   "forcats"     "stringr"     "dplyr"      
## [11] "purrr"       "readr"       "tidyr"       "tibble"      "ggplot2"    
## [16] "tidyverse"   "stats"       "graphics"    "grDevices"   "utils"      
## [21] "datasets"    "methods"     "base"       
## 
## [[9]]
##  [1] "plotly"      "viridis"     "viridisLite" "gridExtra"   "sjPlot"     
##  [6] "lme4"        "Matrix"      "lubridate"   "forcats"     "stringr"    
## [11] "dplyr"       "purrr"       "readr"       "tidyr"       "tibble"     
## [16] "ggplot2"     "tidyverse"   "stats"       "graphics"    "grDevices"  
## [21] "utils"       "datasets"    "methods"     "base"       
## 
## [[10]]
##  [1] "knitr"       "plotly"      "viridis"     "viridisLite" "gridExtra"  
##  [6] "sjPlot"      "lme4"        "Matrix"      "lubridate"   "forcats"    
## [11] "stringr"     "dplyr"       "purrr"       "readr"       "tidyr"      
## [16] "tibble"      "ggplot2"     "tidyverse"   "stats"       "graphics"   
## [21] "grDevices"   "utils"       "datasets"    "methods"     "base"       
## 
## [[11]]
##  [1] "kableExtra"  "knitr"       "plotly"      "viridis"     "viridisLite"
##  [6] "gridExtra"   "sjPlot"      "lme4"        "Matrix"      "lubridate"  
## [11] "forcats"     "stringr"     "dplyr"       "purrr"       "readr"      
## [16] "tidyr"       "tibble"      "ggplot2"     "tidyverse"   "stats"      
## [21] "graphics"    "grDevices"   "utils"       "datasets"    "methods"    
## [26] "base"       
## 
## [[12]]
##  [1] "readxl"      "kableExtra"  "knitr"       "plotly"      "viridis"    
##  [6] "viridisLite" "gridExtra"   "sjPlot"      "lme4"        "Matrix"     
## [11] "lubridate"   "forcats"     "stringr"     "dplyr"       "purrr"      
## [16] "readr"       "tidyr"       "tibble"      "ggplot2"     "tidyverse"  
## [21] "stats"       "graphics"    "grDevices"   "utils"       "datasets"   
## [26] "methods"     "base"       
## 
## [[13]]
##  [1] "readxl"      "kableExtra"  "knitr"       "plotly"      "viridis"    
##  [6] "viridisLite" "gridExtra"   "sjPlot"      "lme4"        "Matrix"     
## [11] "lubridate"   "forcats"     "stringr"     "dplyr"       "purrr"      
## [16] "readr"       "tidyr"       "tibble"      "ggplot2"     "tidyverse"  
## [21] "stats"       "graphics"    "grDevices"   "utils"       "datasets"   
## [26] "methods"     "base"

Data Import and Cleaning

hiv_data_path <- "C:/Users/PC/Desktop/Internship_task_CEMA/internship_task_dscience-main/HIV data 2000-2023.csv"
poverty_data_path <- "C:/Users/PC/Desktop/Internship_task_CEMA/internship_task_dscience-main/multidimensional_poverty.xlsx"

hiv_data <- read_tsv(hiv_data_path)

poverty_data <- read_excel(poverty_data_path, skip = 2)

glimpse(hiv_data)
## Rows: 1,552
## Columns: 1
## $ `IndicatorCode,Indicator,ValueType,ParentLocationCode,ParentLocation,Location type,SpatialDimValueCode,Location,Period type,Period,Value` <chr> …
glimpse(poverty_data)
## Rows: 110
## Columns: 16
## $ ...1                         <chr> "SSA", "ECA", "LAC", "ECA", "EAP", "ECA",…
## $ ...2                         <chr> "AGO", "ALB", "ARG", "ARM", "AUS", "AUT",…
## $ ...3                         <chr> "Angola", "Albania", "Argentina", "Armeni…
## $ ...4                         <dbl> 2018, 2012, 2010, 2010, 2010, 2009, 2013,…
## $ ...5                         <chr> "IDREA", "HBS", "EPHC-S2", "ILCS", "SIH-L…
## $ ...6                         <dbl> 2018, 2018, 2021, 2021, 2018, 2022, 2020,…
## $ ...7                         <chr> "N", "N", "U", "N", "N", "N", "N", "N", "…
## $ ...8                         <chr> "c", "c", "i", "c", "I", "i", "c", "i", "…
## $ ...9                         <dbl> 2, 1, 3, 1, 3, 2, 1, 2, 1, 3, 2, 5, 1, 5,…
## $ `Monetary (%)`               <dbl> 31.122004986, 0.048107048, 0.894217938, 0…
## $ `Educational attainment (%)` <chr> "29.7534227371215", "0.192380091175436", …
## $ `Educational enrollment (%)` <chr> "27.443060278892499", "-", "0.73135080747…
## $ `Electricity (%)`            <chr> "52.639532089233398", "6.0250010574236498…
## $ `Sanitation (%)`             <chr> "53.637516498565596", "6.5797723829746202…
## $ `Drinking water (%)`         <chr> "32.106507280141003", "9.5949657452627903…
## $ ...16                        <dbl> 47.20360637, 0.29316142, 0.90657296, 0.52…

Cleaning HIV Data

Let’s clean and prepare the HIV dataset:

hiv_data_clean <- hiv_data %>%
  # Spliting the single column into multiple columns based on commas
  separate(`IndicatorCode,Indicator,ValueType,ParentLocationCode,ParentLocation,Location type,SpatialDimValueCode,Location,Period type,Period,Value`,
           into = c("indicator_code", "indicator", "value_type", "parent_location_code", 
                    "parent_location", "location_type", "spatial_dim_value_code", "country", 
                    "period_type", "year", "value"),
           sep = ",", extra = "drop", fill = "right") %>%
  
  # Cleaning up invalid UTF-8 characters
  mutate(across(everything(), ~ stringi::stri_enc_toascii(.))) %>%
  
  # Extracting the numeric values from the 'value' column
  mutate(
    hiv_cases = as.numeric(gsub(" .*", "", value)),
    
    # Converting "No data" and similar entries to NA
    hiv_cases = ifelse(grepl("No data|<", value), NA, hiv_cases),
    
    # Converting to thousands for easier analysis
    hiv_cases = hiv_cases * 1000
  ) %>%
  
  # Filtering out rows with NA values for HIV cases
  filter(!is.na(hiv_cases)) %>%
  
  # Selecting relevant columns
  select(
    country = country,
    year = year,
    hiv_cases,
    who_region = parent_location_code
  ) %>%
  
  # Converting years to numeric
  
  mutate(year = as.numeric(year)) %>%
  # Filtering data for the years of interest (2000-2023)
  filter(year >= 2000 & year <= 2023)

# Displays the structure of the cleaned data
print("Structure of cleaned HIV data:")
## [1] "Structure of cleaned HIV data:"
glimpse(hiv_data_clean)
## Rows: 1,065
## Columns: 4
## $ country    <chr> "Angola", "Angola", "Angola", "Angola", "Angola", "Angola",…
## $ year       <dbl> 2023, 2022, 2021, 2020, 2015, 2010, 2005, 2000, 2023, 2022,…
## $ hiv_cases  <dbl> 320000, 320000, 320000, 320000, 300000, 250000, 190000, 140…
## $ who_region <chr> "AFR", "AFR", "AFR", "AFR", "AFR", "AFR", "AFR", "AFR", "AF…
# Displays first few rows of the cleaned data
print("First few rows of cleaned HIV data:")
## [1] "First few rows of cleaned HIV data:"
head(hiv_data_clean)
## # A tibble: 6 Ă— 4
##   country  year hiv_cases who_region
##   <chr>   <dbl>     <dbl> <chr>     
## 1 Angola   2023    320000 AFR       
## 2 Angola   2022    320000 AFR       
## 3 Angola   2021    320000 AFR       
## 4 Angola   2020    320000 AFR       
## 5 Angola   2015    300000 AFR       
## 6 Angola   2010    250000 AFR

Cleaning Poverty Data

poverty_data_clean <- poverty_data %>%
  # Select and rename columns using the numbered system
  select(
    region = `...1`,
    country_code = `...2`,
    country = `...3`,
    reporting_year = `...4`,
    survey_name = `...5`,
    survey_year = `...6`,
    survey_coverage = `...7`,
    welfare_type = `...8`,
    survey_comparability = `...9`,
    monetary = `Monetary (%)`,
    education_attainment = `Educational attainment (%)`,
    school_enrollment = `Educational enrollment (%)`,
    electricity = `Electricity (%)`,
    sanitation = `Sanitation (%)`,
    drinking_water = `Drinking water (%)`,
    poverty_ratio = `...16`
  ) %>%
  
  # Convert "-" to NA, then convert to numeric
  mutate(
    across(c(reporting_year, survey_year), as.numeric),
    across(c(monetary, education_attainment, school_enrollment,
             electricity, sanitation, drinking_water, poverty_ratio),
           ~ifelse(. == "-", NA, as.numeric(.)))
  )

# Displays the structure of the cleaned data
glimpse(poverty_data_clean)
## Rows: 110
## Columns: 16
## $ region               <chr> "SSA", "ECA", "LAC", "ECA", "EAP", "ECA", "SSA", …
## $ country_code         <chr> "AGO", "ALB", "ARG", "ARM", "AUS", "AUT", "BDI", …
## $ country              <chr> "Angola", "Albania", "Argentina", "Armenia", "Aus…
## $ reporting_year       <dbl> 2018, 2012, 2010, 2010, 2010, 2009, 2013, 2009, 2…
## $ survey_name          <chr> "IDREA", "HBS", "EPHC-S2", "ILCS", "SIH-LIS", "EU…
## $ survey_year          <dbl> 2018, 2018, 2021, 2021, 2018, 2022, 2020, 2022, 2…
## $ survey_coverage      <chr> "N", "N", "U", "N", "N", "N", "N", "N", "N", "N",…
## $ welfare_type         <chr> "c", "c", "i", "c", "I", "i", "c", "i", "c", "c",…
## $ survey_comparability <dbl> 2, 1, 3, 1, 3, 2, 1, 2, 1, 3, 2, 5, 1, 5, 0, 1, 3…
## $ monetary             <dbl> 31.122004986, 0.048107048, 0.894217938, 0.5235208…
## $ education_attainment <dbl> 29.75342274, 0.19238009, 1.08531965, 0.00000000, …
## $ school_enrollment    <dbl> 27.4430603, NA, 0.7313508, 1.7930038, NA, NA, 34.…
## $ electricity          <dbl> 52.63953209, 0.06025001, 0.00000000, 0.00000000, …
## $ sanitation           <dbl> 53.6375165, 6.5797724, 0.2574530, 0.3977255, 0.00…
## $ drinking_water       <dbl> 32.1065073, 9.5949657, 0.3640480, 0.6600822, NA, …
## $ poverty_ratio        <dbl> 47.20360637, 0.29316142, 0.90657296, 0.52352082, …

Global HIV Burden Analysis

Countries Contributing to 75% of the Global Burden

Now, let’s identify the countries that contribute to 75% of the global HIV burden:

# To calculate total HIV cases per country (across all available years)
country_totals <- hiv_data_clean %>%
  # Use the most recent year for each country for better representation
  group_by(country) %>%
  filter(year == max(year)) %>%
  summarize(total_hiv_cases = sum(hiv_cases, na.rm = TRUE)) %>%
  arrange(desc(total_hiv_cases))

# Calculate global total HIV cases
global_total <- sum(country_totals$total_hiv_cases)

# Calculate cumulative percentage of global burden
country_totals <- country_totals %>%
  mutate(
    percentage = total_hiv_cases / global_total * 100,
    cum_percentage = cumsum(percentage)
  )

# Identify countries contributing to 75% of global burden
high_burden_countries <- country_totals %>%
  filter(cum_percentage <= 75) %>%
  pull(country)

# Display the high burden countries
cat("Number of countries contributing to 75% of global burden:", length(high_burden_countries), "\n")
## Number of countries contributing to 75% of global burden: 22
print(high_burden_countries)
##  [1] "Israel"     "Georgia"    "Ireland"    "Somalia"    "Singapore" 
##  [6] "Tunisia"    "Suriname"   "Djibouti"   "Estonia"    "Latvia"    
## [11] "Libya"      "Denmark"    "Mauritania" "Armenia"    "Sri Lanka" 
## [16] "Bahamas"    "Czechia"    "Cabo Verde" "Serbia"     "Bulgaria"  
## [21] "Belize"     "Lithuania"
# Create a table showing these countries and their burden
high_burden_table <- country_totals %>%
  filter(country %in% high_burden_countries) %>%
  select(country, total_hiv_cases, percentage, cum_percentage) %>%
  mutate(
    percentage = round(percentage, 2),
    cum_percentage = round(cum_percentage, 2),
    total_hiv_cases = format(total_hiv_cases, big.mark = ",")
  )

kable(high_burden_table, 
      col.names = c("Country", "Total HIV Cases", "% of Global Burden", "Cumulative %"),
      caption = "Countries Contributing to 75% of Global HIV Burden") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"))
Countries Contributing to 75% of Global HIV Burden
Country Total HIV Cases % of Global Burden Cumulative %
Israel 9,600,000 5.20 5.20
Georgia 9,000,000 4.88 10.08
Ireland 8,600,000 4.66 14.74
Somalia 8,200,000 4.44 19.18
Singapore 8,100,000 4.39 23.57
Tunisia 8,000,000 4.33 27.90
Suriname 7,400,000 4.01 31.91
Djibouti 7,300,000 3.96 35.87
Estonia 7,300,000 3.96 39.83
Latvia 6,800,000 3.68 43.51
Libya 6,700,000 3.63 47.14
Denmark 6,400,000 3.47 50.61
Mauritania 6,400,000 3.47 54.08
Armenia 6,300,000 3.41 57.49
Sri Lanka 4,700,000 2.55 60.04
Bahamas 4,100,000 2.22 62.26
Czechia 4,100,000 2.22 64.48
Cabo Verde 4,000,000 2.17 66.65
Serbia 4,000,000 2.17 68.81
Bulgaria 3,800,000 2.06 70.87
Belize 3,600,000 1.95 72.82
Lithuania 3,600,000 1.95 74.77

Visualization of HIV Cases Trend in High Burden Countries

Let’s visualize the trend of HIV cases in the high burden countries:

# Filter data for high burden countries
high_burden_data <- hiv_data_clean %>%
  filter(country %in% high_burden_countries)

# Create trend visualization
hiv_trend_plot <- high_burden_data %>%
  ggplot(aes(x = year, y = hiv_cases/1000000, color = country, group = country)) +
  geom_line(linewidth = 1) +
  geom_point(size = 2, alpha = 0.7) +
  scale_y_continuous(labels = scales::comma, name = "People Living with HIV (Millions)") +
  scale_color_viridis_d() +
  theme_minimal() +
  labs(
    title = "HIV Cases Trend in Countries Contributing to 75% of Global Burden",
    x = "Year",
    color = "Country"
  ) +
  theme(
    legend.position = "right",
    plot.title = element_text(size = 14, face = "bold"),
    axis.title = element_text(size = 12),
    panel.grid.minor = element_blank()
  )

# Display the plot
print(hiv_trend_plot)

# Make it interactive with plotly
interactive_trend <- ggplotly(hiv_trend_plot)
interactive_trend

Visualizing Regional Distribution with Facets

# Create faceted plot by region with countries contributing to 75% of regional burden
regional_trends_data %>%
  ggplot(aes(x = year, y = hiv_cases/1000000, color = country)) +
  geom_line(linewidth = 0.8) +
  geom_point(size = 1.5, alpha = 0.7) +
  facet_wrap(~who_region, scales = "free_y") +
  scale_y_continuous(labels = scales::comma) +
  scale_color_viridis_d() +
  theme_minimal() +
  labs(
    title = "HIV Cases Trend by WHO Region",
    subtitle = "Countries contributing to 75% of regional burden",
    x = "Year",
    y = "People Living with HIV (Millions)"
  ) +
  theme(
    legend.position = "bottom",
    legend.title = element_blank(),
    plot.title = element_text(size = 14, face = "bold"),
    axis.title = element_text(size = 12),
    strip.text = element_text(face = "bold"),
    strip.background = element_rect(fill = "lightgray", color = NA)
  )

## Relationship Between HIV and Multidimensional Poverty

Let’s merge the HIV data with the multidimensional poverty data:

# Merge HIV and poverty data
# Since poverty data's reporting_year may not match exactly with HIV data's year,
# we'll need to make some adjustments

# First, let's get the most recent HIV data for each country
recent_hiv_data <- hiv_data_clean %>%
  group_by(country) %>%
  filter(year == max(year)) %>%
  ungroup()

# Now merge with poverty data
merged_data <- recent_hiv_data %>%
  inner_join(poverty_data_clean, by = "country")

# Check the merged data
cat("Number of countries in merged dataset:", nrow(merged_data), "\n")
## Number of countries in merged dataset: 79
# Check for missing values after merge
missing_values <- merged_data %>%
  summarize(across(everything(), ~sum(is.na(.)))) %>%
  pivot_longer(everything(), names_to = "variable", values_to = "missing_count") %>%
  filter(missing_count > 0)

print(missing_values)
## # A tibble: 4 Ă— 2
##   variable          missing_count
##   <chr>                     <int>
## 1 school_enrollment            23
## 2 electricity                   1
## 3 sanitation                   16
## 4 drinking_water                6

Analyzing Relationship Between HIV Cases and Poverty Metrics

# Create scatterplot of HIV cases vs. poverty ratio
hiv_poverty_plot <- merged_data %>%
  ggplot(aes(x = poverty_ratio, y = hiv_cases/1000000)) +
  geom_point(aes(color = who_region, size = hiv_cases), alpha = 0.7) +
  geom_smooth(method = "lm", se = TRUE, color = "black", linetype = "dashed") +
  scale_y_continuous(labels = scales::comma) +
  scale_size_continuous(guide = "none") +
  theme_minimal() +
  labs(
    title = "Relationship Between HIV Cases and Multidimensional Poverty",
    x = "Multidimensional Poverty Headcount Ratio (%)",
    y = "People Living with HIV (Millions)",
    color = "WHO Region"
  ) +
  theme(
    legend.position = "right",
    plot.title = element_text(size = 14, face = "bold"),
    axis.title = element_text(size = 12)
  )

print(hiv_poverty_plot)

# Calculate and save correlations
correlation_analysis <- merged_data %>%
  summarize(
    correlation_poverty_ratio = cor(hiv_cases, poverty_ratio, use = "complete.obs"),
    correlation_monetary = cor(hiv_cases, monetary, use = "complete.obs"),
    correlation_education = cor(hiv_cases, education_attainment, use = "complete.obs"),
    correlation_school = cor(hiv_cases, school_enrollment, use = "complete.obs"),
    correlation_electricity = cor(hiv_cases, electricity, use = "complete.obs"),
    correlation_sanitation = cor(hiv_cases, sanitation, use = "complete.obs"),
    correlation_water = cor(hiv_cases, drinking_water, use = "complete.obs")
  )

# Convert correlations to a more readable format
correlation_tidy <- correlation_analysis %>%
  pivot_longer(
    cols = everything(),
    names_to = "variable",
    values_to = "correlation"
  ) %>%
  mutate(
    variable = gsub("correlation_", "", variable),
    variable = gsub("_", " ", variable),
    variable = stringr::str_to_title(variable)
  )

# Create bar plot of correlations
correlation_plot <- ggplot(correlation_tidy, aes(x = reorder(variable, correlation), y = correlation)) +
  geom_col(fill = "steelblue") +
  coord_flip() +
  theme_minimal() +
  labs(
    title = "Correlation between HIV Cases and Poverty Factors",
    x = "Poverty Factor",
    y = "Correlation Coefficient"
  ) +
  theme(
    plot.title = element_text(size = 14, face = "bold"),
    axis.title = element_text(size = 12)
  )

print(correlation_plot)

Mixed Effects Model Analysis

Let’s account for random effects (country, region) in our analysis:

# Prepare data for mixed effects model
model_data <- merged_data %>%
  # Select relevant columns and remove NAs
  select(country, who_region, hiv_cases, 
         poverty_ratio, monetary, education_attainment, 
         school_enrollment, electricity, sanitation, drinking_water) %>%
  filter(complete.cases(.))

# Log-transform HIV cases for better model fit
model_data <- model_data %>%
  mutate(log_hiv_cases = log(hiv_cases + 1))

# Scale variables for better model convergence
model_data_scaled <- model_data %>%
  mutate(across(c(poverty_ratio, monetary, education_attainment, 
                 school_enrollment, electricity, sanitation, drinking_water),
                scale))

# Fit mixed effects model with country and region as random effects
mixed_model <- try(
  lmer(log_hiv_cases ~ 
         poverty_ratio + monetary + education_attainment + 
         school_enrollment + electricity + sanitation + drinking_water +
         (1|who_region), 
       data = model_data_scaled),
  silent = TRUE
)

# Load the package
library(glmmTMB)

# Check if model converged
if(!inherits(mixed_model, "try-error")) {
  # Model summary
  summary_model <- summary(mixed_model)
  print(summary_model)
  
  # Plot the model results
  plot_model(mixed_model, type = "est", sort.est = TRUE) +
    labs(title = "Mixed Effects Model: Factors Affecting HIV Cases",
         subtitle = "Accounting for region random effects")
  
  # Variance explained by random effects
  plot_model(mixed_model, type = "re") +
    labs(title = "Random Effects by WHO Region")
  
  # Check model diagnostics
  plot_model(mixed_model, type = "diag")
} else {
  # If the model fails, try a simpler version
  cat("Full model did not converge. Trying a simpler model.\n")
  
  simple_model <- lm(log_hiv_cases ~ 
                      poverty_ratio + monetary + education_attainment + 
                      school_enrollment + electricity + sanitation + drinking_water,
                    data = model_data_scaled)
  
  summary_simple <- summary(simple_model)
  print(summary_simple)
  
  # Plot coefficients
  plot_model(simple_model, type = "est", sort.est = TRUE) +
    labs(title = "Linear Model: Factors Affecting HIV Cases")
}
## Linear mixed model fit by REML ['lmerMod']
## Formula: log_hiv_cases ~ poverty_ratio + monetary + education_attainment +  
##     school_enrollment + electricity + sanitation + drinking_water +  
##     (1 | who_region)
##    Data: model_data_scaled
## 
## REML criterion at convergence: 223.9
## 
## Scaled residuals: 
##      Min       1Q   Median       3Q      Max 
## -1.89075 -0.67078 -0.02105  0.67203  2.10069 
## 
## Random effects:
##  Groups     Name        Variance Std.Dev.
##  who_region (Intercept) 1.213    1.101   
##  Residual               4.467    2.113   
## Number of obs: 54, groups:  who_region, 6
## 
## Fixed effects:
##                      Estimate Std. Error t value
## (Intercept)          11.94256    0.59095  20.209
## poverty_ratio        -1.88010    3.52584  -0.533
## monetary             -0.87840    1.47788  -0.594
## education_attainment  1.27994    0.83832   1.527
## school_enrollment     0.09196    0.80406   0.114
## electricity           1.67694    1.14806   1.461
## sanitation           -0.67444    0.80804  -0.835
## drinking_water       -0.07581    0.66910  -0.113
## 
## Correlation of Fixed Effects:
##             (Intr) pvrty_ montry edctn_ schl_n elctrc santtn
## poverty_rat  0.105                                          
## monetary    -0.056 -0.909                                   
## edctn_ttnmn -0.015 -0.658  0.639                            
## schl_nrllmn -0.082 -0.697  0.675  0.151                     
## electricity -0.102 -0.715  0.461  0.395  0.482              
## sanitation   0.009 -0.268  0.233 -0.040  0.132 -0.085       
## drinkng_wtr  0.039 -0.578  0.538  0.506  0.273  0.348 -0.056
## [[1]]

## 
## [[2]]
## [[2]]$who_region

## 
## 
## [[3]]

## 
## [[4]]

Conclusion

This analysis has explored:

  1. The countries contributing to 75% of the global HIV burden
  2. Trends in HIV cases among these high-burden countries
  3. Regional distribution and trends of HIV cases
  4. The relationship between HIV prevalence and multidimensional poverty

Key findings:

  • Countries contributing to 75% of global HIV cases show varying trends from 2000 to 2020.
  • Most countries exhibit rising HIV cases, with regional differences in the pace of growth.
  • Notably, Europe (EUR) and Eastern Mediterranean (EMR) regions show sharp increases in recent years.

Regional Trends:
- Europe (EUR): Steady upward trends, especially in Israel and Ireland.
- Eastern Mediterranean (EMR): Dramatic rises post-2015, with Somalia and Djibouti spiking sharply.

  • Africa (AFR): Surprisingly lower case numbers despite higher poverty levels.

  • Southeast Asia (SEAR) & Western Pacific (WPR): More moderate, stable growth patterns.

Poverty and HIV Relationship:
- All poverty-related factors show a weak negative correlation with HIV cases (ranging from -0.20 to -0.15).
- Strongest negative links: overall poverty ratio, monetary poverty, sanitation, and electricity access.
- Moderate links: education, school enrollment, and water access.

Insights:
- Countries with better living standards report higher HIV cases.
- Possible reasons:
- Better testing and reporting systems
- Higher survival rates due to advanced healthcare
- Stronger public health surveillance

  • Europe: High HIV rates despite low poverty.
  • Africa: Lower reported cases despite high poverty.
  • This underscores how healthcare quality, reporting systems, and testing access shape the HIV narrative beyond economic status alone.

Summary:
The relationship between poverty and HIV is more complex than expected. Strong healthcare systems and reporting capabilities may explain why wealthier countries show higher HIV numbers. This highlights the need for region-specific strategies that go beyond economic measures, focusing on healthcare access, surveillance, and tailored interventions.

Question 2

Analysis of East African Community Mortality Rates

# Load required libraries
library(tidyverse)
library(sf)
library(rnaturalearth)
library(rnaturalearthdata)
library(viridis)
library(knitr)

# Read the dataset
mortality_data <- read.csv("C:/Users/PC/Downloads/dataset_datascience.csv", 
                           stringsAsFactors = FALSE)

# East African Community countries
eac_countries <- c("Burundi", "Kenya", "Rwanda", "South Sudan", "Tanzania", 
                   "Uganda", "Democratic Republic of the Congo", "Somalia")

Data Preprocessing

# Filter for East African Community countries and included estimates
eac_data <- mortality_data %>%
  filter(Geographic.area %in% eac_countries) %>%
  filter(Observation.Status == "Included in IGME") %>%
  
  # Creates a proper year column from Reference.Date
  mutate(Year = floor(Reference.Date))

# Splits data by indicator
under_five_data <- eac_data %>%
  filter(Indicator == "Under-five mortality rate")

neonatal_data <- eac_data %>%
  filter(Indicator == "Neonatal mortality rate")

# Gets latest data for each country for mapping
latest_under_five <- under_five_data %>%
  group_by(Geographic.area) %>%
  arrange(desc(Year)) %>%
  slice(1) %>%
  ungroup()

latest_neonatal <- neonatal_data %>%
  group_by(Geographic.area) %>%
  arrange(desc(Year)) %>%
  slice(1) %>%
  ungroup()

# Checking for countries with data available
cat("EAC countries with under-five mortality data:", 
    paste(unique(latest_under_five$Geographic.area), collapse=", "), "\n")
## EAC countries with under-five mortality data: Burundi, Democratic Republic of the Congo, Kenya, Rwanda, Somalia, South Sudan, Uganda
cat("EAC countries with neonatal mortality data:", 
    paste(unique(latest_neonatal$Geographic.area), collapse=", "), "\n")
## EAC countries with neonatal mortality data: Burundi, Democratic Republic of the Congo, Kenya, Rwanda, Somalia, South Sudan, Uganda

Getting Map Data

# Get world map data
world <- ne_countries(scale = "medium", returnclass = "sf")

# Filter for East African countries
eac_map <- world %>%
  filter(name %in% eac_countries | sovereignt %in% eac_countries)

# Checking if all countries have been found
missing_countries <- setdiff(eac_countries, 
                            c(eac_map$name, eac_map$sovereignt))
if(length(missing_countries) > 0) {
  warning(paste("Missing countries in map data:", 
                paste(missing_countries, collapse=", ")))
}

# Merge map data with mortality data
# Adjust the joining based on how country names appear in both datasets
eac_under_five_map <- eac_map %>%
  left_join(latest_under_five, by = c("name" = "Geographic.area"))

eac_neonatal_map <- eac_map %>%
  left_join(latest_neonatal, by = c("name" = "Geographic.area"))

# If name doesn't match, try with sovereign name
eac_under_five_map <- eac_under_five_map %>%
  left_join(latest_under_five %>% 
              filter(!(Geographic.area %in% eac_map$name)), 
            by = c("sovereignt" = "Geographic.area"))

eac_neonatal_map <- eac_neonatal_map %>%
  left_join(latest_neonatal %>% 
              filter(!(Geographic.area %in% eac_map$name)), 
            by = c("sovereignt" = "Geographic.area"))

Visualizing Latest Mortality Estimates

# function to plot the maps
plot_mortality_map <- function(data, title, legend_title) {
  ggplot(data) +
    geom_sf(aes(fill = Observation.Value.x), color = "white", size = 0.2) +
    scale_fill_viridis(option = "magma",
                      name = legend_title,
                      direction = -1,
                      na.value = "grey80") +  # Added handling for NA values
    labs(title = title) +
    theme_minimal() +
    theme(
      plot.title = element_text(size = 14, face = "bold"),
      legend.position = "right",
      plot.margin = margin(10, 10, 10, 10)
    )
}

# Plot under-five mortality map
under_five_map <- plot_mortality_map(
  eac_under_five_map,
  "Latest Under-Five Mortality Rate in East African Community",
  "Deaths per 1,000\nlive births"
)

# Plot neonatal mortality map
neonatal_map <- plot_mortality_map(
  eac_neonatal_map,
  "Latest Neonatal Mortality Rate in East African Community",
  "Deaths per 1,000\nlive births"
)

# Combine maps
gridExtra::grid.arrange(under_five_map, neonatal_map, ncol = 2)

Identifying Countries with Highest Mortality Rates

# Identify countries with highest under-five mortality rates
highest_under_five <- latest_under_five %>%
  arrange(desc(Observation.Value)) %>%
  select(Geographic.area, Observation.Value, Year)

# Identify countries with highest neonatal mortality rates
highest_neonatal <- latest_neonatal %>%
  arrange(desc(Observation.Value)) %>%
  select(Geographic.area, Observation.Value, Year)

# Display results
cat("East African countries with highest under-five mortality rates:\n")
## East African countries with highest under-five mortality rates:
kable(highest_under_five, 
      col.names = c("Country", "Under-five mortality rate", "Year of estimate"),
      digits = 1)
Country Under-five mortality rate Year of estimate
Somalia 129.6 2003
Democratic Republic of the Congo 82.3 2014
Burundi 74.2 2013
Uganda 55.9 2015
South Sudan 45.6 2007
Kenya 38.5 2021
Rwanda 34.1 2022
cat("\nEast African countries with highest neonatal mortality rates:\n")
## 
## East African countries with highest neonatal mortality rates:
kable(highest_neonatal, 
      col.names = c("Country", "Neonatal mortality rate", "Year of estimate"),
      digits = 1)
Country Neonatal mortality rate Year of estimate
South Sudan 52.0 2006
Somalia 37.5 2003
Democratic Republic of the Congo 27.5 2010
Uganda 25.0 2013
Burundi 23.7 2013
Kenya 21.1 2019
Rwanda 20.0 2016

Conclusion

The analysis of under-five and neonatal mortality rates across the East African Community reveals significant progress, with a consistent downward trend in mortality rates over time. However, this improvement masks substantial disparities between member states:

Highest mortality rates: South Sudan shows the most concerning figures in recent data, while Somalia and the Democratic Republic of the Congo demonstrated similarly elevated rates in earlier periods. Leading performers: Kenya and Rwanda have achieved markedly lower mortality rates, suggesting effective healthcare policies and interventions that could serve as regional models.

These considerable variations highlight differing levels of healthcare infrastructure, policy effectiveness, and resource allocation across the region. The temporal disparity in latest available data points (ranging from 2003 to 2022) adds complexity to direct comparisons but doesn’t diminish the clear pattern of regional inequality. These findings point to specific opportunities for regional cooperation:

  • Knowledge transfer from better-performing states to those with persistent challenges
  • Targeted interventions in high-mortality areas, particularly South Sudan
  • Standardized data collection systems to ensure consistent monitoring

The overall positive trend is encouraging, but achieving equitable child health outcomes across the EAC will require coordinated regional strategies, continued investment in healthcare systems, and data-driven decision-making. Regular monitoring and evaluation remain essential to track progress and adjust interventions accordingly.